# finaquant Financial Analytics - www.finaquant.com
# Copyright: Tunc Ali Ktkcoglu  2012, version: 8Mar2012
# TITLE: Simple Time Estimation Model with Linear Regression (LR) in Action
# Related web page:
# http://finaquant.com/predictive-modelling-with-linear-regression/520

# Predictive model for estimating the time record of an athlete (in seconds) 
# running a 100-meter distance against the wind (speed in km/h) with a given 
# weight (in kg) attached to his belt. 

# y = run_time_record(x1, x2, t):
# Hypothetical function that describes the assumed physical relationships.
# Arguments (combined element-wise, so vectors are accepted):
# - x1: wind speed (km/h)
# - x2: weight (kg)
# - t: temperature (Celsius); the temperature penalty is zero at 20 C
# Value: the resulting time record y (in seconds)
run_time_record <- function(x1, x2, t) {
  base_time <- 12
  wind_linear <- 0.1 * x1
  wind_square <- 0.0005 * x1^2
  weight_linear <- 0.15 * x2
  weight_square <- 0.003 * x2^2
  # quadratic penalty for the distance from the optimal temperature
  temperature_penalty <- 0.005 * abs(t - 20)^2
  base_time + wind_linear + wind_square + weight_linear + weight_square +
    temperature_penalty
}
# The parameter "temperature" which is not captured by the estimation model
# represents generally all kinds of unknown factors affecting the outcome.
# In that sense, this parameter also represents the uncertainty and error
# in estimations based on input parameters x2 and/or x1.

# Helper for displaying a variable: prints a "<vname>  =" header line
# followed by the value itself, both without surrounding quotes.
# - v: the value to print
# - vname: the label shown in the header line
display_variable <- function(v, vname) {
  header <- paste(vname, ' =')
  print(header, quote = FALSE)
  print(v, quote = FALSE)
}

# Surface plot of the physical function with a constant weight
# wind speed range: 0-40 km/h, temperature range: 0-30 degrees Celsius
X1 <- seq(0, 40, 0.2)  # wind speed grid
# The temperature grid is deliberately not called `T`: assigning to `T`
# would mask R's built-in alias for TRUE for the rest of the script.
temperature_grid <- seq(0, 30, 0.2)

# Time record as a function of wind speed and temperature; the weight
# argument of run_time_record() is held constant at 1.
observed_time_records_for_constant_weight <- function(x1, t) {
  run_time_record(x1, 1, t)
}

Yo <- outer(X1, temperature_grid, observed_time_records_for_constant_weight)
# persp() derives default axis labels from the argument expressions, so
# ylab is given explicitly to keep the temperature axis labelled 'T'.
persp(X1, temperature_grid, Yo, ylab = 'T', main = 'Physical Function')
dev.new()  # open a new graphics device for the next plot

# Number of observations: N for the training data, M for the test data
N <- 200
M <- 100

print('Simple Time Estimation Model with Linear Regression (LR) in Action')

# Generate historical data with N+M observations.
# The temperature vector is named `temperature` rather than `t` so that it
# does not shadow base::t(), which is used below to transpose matrices.
print('Generate TRAINING data')
x1 <- 50 * runif(N)                # wind speed in km/h
x2 <- 4 * runif(N)                 # weight in kg
temperature <- 10 + 20 * runif(N)  # temperature in C
y <- run_time_record(x1, x2, temperature)

# construct matrices:
# X_train has one column per observation and the rows (1, x1, x2);
# Y_train is an N x 1 column of the corresponding time records
X_train <- rbind(matrix(1, 1, N), x1, x2)
Y_train <- t(matrix(y, 1, N))

print('Generate TEST data')
x1 <- 50 * runif(M)                # wind speed in km/h
x2 <- 4 * runif(M)                 # weight in kg
temperature <- 10 + 20 * runif(M)  # temperature in C
y <- run_time_record(x1, x2, temperature)

# construct matrices (same layout as for the training data)
X_test <- rbind(matrix(1, 1, M), x1, x2)
Y_test <- t(matrix(y, 1, M))

print('******************************************************************')
print('CASE 1: Single parameter time estimation with wind speed (x1)')
print('******************************************************************')

# first two rows only: the constant row and the wind speed row
X_train1 <- X_train[1:2, ]
X_test1 <- X_test[1:2, ]

print('Find optimal coefficient vector B with training data')
# Least-squares coefficients from the normal equations (X X') B = X Y.
# The linear system is solved directly with solve(A, b) instead of forming
# the explicit inverse of X X' and multiplying by it.
Bopt <- solve(X_train1 %*% t(X_train1), X_train1 %*% Y_train)
display_variable(Bopt, 'Bopt')

print('Calculate estimated time records')
Ye_train <- t(X_train1) %*% Bopt
Ye_test <- t(X_test1) %*% Bopt

print('Calculate training and test error')
# mean squared error = sum of squared residuals / number of observations
E_train <- Y_train - Ye_train
SSE_train <- t(E_train) %*% E_train
MSE_train <- SSE_train / N
display_variable(MSE_train, 'MSE_train')

E_test <- Y_test - Ye_test
SSE_test <- t(E_test) %*% E_test
MSE_test <- SSE_test / M
display_variable(MSE_test, 'MSE_test')

print('Show curve fitting (approximation) for test data')
# sort the test observations by wind speed so the estimate is drawn as a
# continuous line from left to right
res <- sort(X_test[2, ], index.return = TRUE)
x1 <- res$x
ind <- res$ix
plot(x1, Y_test[ind], col = "blue", xlab = 'wind speed (km/h)',
     ylab = 'time record',
     main = 'TEST data: Observed (blue dots) vs estimated (red line) time records')
lines(x1, Ye_test[ind], col = "red")
dev.new()  # open a new graphics device for the next plot

print('******************************************************************')
print('CASE 2: Single parameter time estimation with 2nd degree polynomial regression')
print('******************************************************************')

# 2nd degree polynomial approximation
# ye = b0 + b1 * x1 + b2 * x1^2
# The design matrices get a third row holding the squared wind speed.
x1 <- X_train[2, ]
X_train2 <- rbind(X_train[1:2, ], x1^2)

x1 <- X_test[2, ]
X_test2 <- rbind(X_test[1:2, ], x1^2)

print('Find optimal coefficient vector B with training data')
# Least-squares coefficients from the normal equations (X X') B = X Y.
# The linear system is solved directly with solve(A, b) instead of forming
# the explicit inverse of X X' and multiplying by it.
Bopt <- solve(X_train2 %*% t(X_train2), X_train2 %*% Y_train)
display_variable(Bopt, 'Bopt')

print('Calculate estimated time records')
Ye_train <- t(X_train2) %*% Bopt
Ye_test <- t(X_test2) %*% Bopt

print('Calculate training and test error')
# mean squared error = sum of squared residuals / number of observations
E_train <- Y_train - Ye_train
SSE_train <- t(E_train) %*% E_train
MSE_train <- SSE_train / N
display_variable(MSE_train, 'MSE_train')

E_test <- Y_test - Ye_test
SSE_test <- t(E_test) %*% E_test
MSE_test <- SSE_test / M
display_variable(MSE_test, 'MSE_test')

print('Show curve fitting (approximation) for test data')
# sort the test observations by wind speed so the estimate is drawn as a
# continuous line from left to right
res <- sort(X_test[2, ], index.return = TRUE)
x1 <- res$x
ind <- res$ix
plot(x1, Y_test[ind], col = "blue", xlab = 'wind speed (km/h)',
     ylab = 'time record',
     main = 'TEST data: Observed (blue dots) vs estimated (red line) time records')
lines(x1, Ye_test[ind], col = "red")
dev.new()  # open a new graphics device for the next plot

print('******************************************************************')
print('CASE 3: Two parameter time estimation with wind speed (x1) and weight (x2)')
print('******************************************************************')

print('Find optimal coefficient vector B with training data')
# Least-squares coefficients from the normal equations (X X') B = X Y,
# using all three rows of X_train (constant, x1, x2).
# The linear system is solved directly with solve(A, b) instead of forming
# the explicit inverse of X X' and multiplying by it.
Bopt <- solve(X_train %*% t(X_train), X_train %*% Y_train)
display_variable(Bopt, 'Bopt')

print('Calculate estimated time records')
Ye_train <- t(X_train) %*% Bopt
Ye_test <- t(X_test) %*% Bopt

print('Calculate training and test error')
# mean squared error = sum of squared residuals / number of observations
E_train <- Y_train - Ye_train
SSE_train <- t(E_train) %*% E_train
MSE_train <- SSE_train / N
display_variable(MSE_train, 'MSE_train')

E_test <- Y_test - Ye_test
SSE_test <- t(E_test) %*% E_test
MSE_test <- SSE_test / M
display_variable(MSE_test, 'MSE_test')

print('Prepare data for surface plot')
# wind speed range: 0-40 km/h, weight range: 0-4 kg
X1 <- seq(0, 40, 0.2)  # wind speed
X2 <- seq(0, 4, 0.1)   # weight

# Estimated time records of the linear two-parameter model:
#   b0 + b1 * x1 + b2 * x2
# The coefficients are not passed in; Bopt is looked up in the enclosing
# environment each time the function is called.
generate_estimated_time_records3 <- function(x1, x2) {
  intercept <- Bopt[1]
  wind_part <- Bopt[2] * x1
  weight_part <- Bopt[3] * x2
  intercept + wind_part + weight_part
}

# define function to generate observed time records with random temperature
# - x1: wind speed values, x2: weight values
# outer() calls this function a single time with the fully expanded grid
# vectors, so one temperature is drawn per element; drawing a single value
# would give the whole surface the same temperature.
# Temperatures are drawn uniformly between 10 and 30 C.
generate_observed_time_records <- function(x1, x2) {
  n_obs <- max(length(x1), length(x2))
  temperature <- 10 + 20 * runif(n_obs)
  run_time_record(x1, x2, temperature)
}

# surface plots over the (X1, X2) grid: first the estimated time records,
# then the generated observations; a new graphics device is opened after each
Ye <- outer(X1, X2, FUN = generate_estimated_time_records3)
persp(x = X1, y = X2, z = Ye, main = 'Case 3: Time record estimation')
dev.new()

Y <- outer(X1, X2, FUN = generate_observed_time_records)
persp(x = X1, y = X2, z = Y, main = 'Case 3: Observed historical time records')
dev.new()

print('******************************************************************')
# The banner names the polynomial model so it is distinguishable from the
# linear two-parameter model of Case 3.
print('CASE 4: Two parameter time estimation with 2nd degree polynomial regression')
print('******************************************************************')

# 2nd degree polynomial approximation
# ye = b0 + b1 * x1 + b2 * x1^2 + b3 * x2 + b4 * x2^2
# The design matrices have the rows (1, x1, x1^2, x2, x2^2).
x1 <- X_train[2, ]
x2 <- X_train[3, ]
X_train4 <- rbind(X_train[1:2, ], x1^2, x2, x2^2)

x1 <- X_test[2, ]
x2 <- X_test[3, ]
X_test4 <- rbind(X_test[1:2, ], x1^2, x2, x2^2)

print('Find optimal coefficient vector B with training data')
# Least-squares coefficients from the normal equations (X X') B = X Y.
# The linear system is solved directly with solve(A, b) instead of forming
# the explicit inverse of X X' and multiplying by it.
Bopt <- solve(X_train4 %*% t(X_train4), X_train4 %*% Y_train)
display_variable(Bopt, 'Bopt')

print('Calculate estimated time records')
Ye_train <- t(X_train4) %*% Bopt
Ye_test <- t(X_test4) %*% Bopt

print('Calculate training and test error')
# mean squared error = sum of squared residuals / number of observations
E_train <- Y_train - Ye_train
SSE_train <- t(E_train) %*% E_train
MSE_train <- SSE_train / N
display_variable(MSE_train, 'MSE_train')

E_test <- Y_test - Ye_test
SSE_test <- t(E_test) %*% E_test
MSE_test <- SSE_test / M
display_variable(MSE_test, 'MSE_test')

print('Prepare data for surface plot')
# wind speed range: 0-40 km/h, weight range: 0-4 kg
X1 <- seq(0, 40, 0.2)  # wind speed
X2 <- seq(0, 4, 0.1)   # weight

# Estimated time records of the 2nd degree polynomial two-parameter model:
#   b0 + b1 * x1 + b2 * x1^2 + b3 * x2 + b4 * x2^2
# The coefficients are not passed in; Bopt is looked up in the enclosing
# environment each time the function is called.
generate_estimated_time_records4 <- function(x1, x2) {
  intercept <- Bopt[1]
  wind_linear <- Bopt[2] * x1
  wind_square <- Bopt[3] * x1^2
  weight_linear <- Bopt[4] * x2
  weight_square <- Bopt[5] * x2^2
  intercept + wind_linear + wind_square + weight_linear + weight_square
}

# surface plots over the (X1, X2) grid: first the estimated time records
# (followed by a new graphics device), then the generated observations
Ye <- outer(X1, X2, FUN = generate_estimated_time_records4)
persp(x = X1, y = X2, z = Ye, main = 'Case 4: Time record estimation')
dev.new()

Y <- outer(X1, X2, FUN = generate_observed_time_records)
persp(x = X1, y = X2, z = Y, main = 'Case 4: Observed historical time records')
